library(data.table)
library(dplyr)
library(tidyr)
library(Weighted.Desc.Stat)

dir.project <- './'
dir.data <- paste0(dir.project,'data/')
dir.raw <- paste0(dir.data,'raw/')
dir.generated <- paste0(dir.data,'generated/')

dat.in <- fread(paste0(dir.generated,"r/morg_data_stacked_90.csv")) %>% filter(year==1990)

# restrict to wage sample

dat <- dat.in %>% filter(wagesample == 1)

## Summary statistics, breakdown by cat3 and schooling

# generate highest grade completed
dat <- dat %>% mutate(highestgrade = ifelse(gradecp==1,gradeat,gradeat-1))

dat.stats <- dat %>% 
  mutate(schooling = ifelse(highestgrade<12,'less than highschool',
                            ifelse(highestgrade==12, 'highschool',
                            ifelse(highestgrade>12 & highestgrade<14, 'some college',
                            ifelse(highestgrade>=14 & highestgrade<=16, 'college',
                            ifelse(highestgrade>=17, 'more than college',NA))))))

# base summary statistics on pareto-imputed wages
dat.stats$hrwage16 <- dat.stats$hrwage16_pareto

sumstats <- dat.stats %>%  group_by(year,cat3) %>% 
  mutate(hrwage16.mean = weighted.mean(hrwage16,weight=earnwt),
         hrwage16.sd = sd(hrwage16),
         empl = sum(round(earnwt)),
         hrwage16.median = median(hrwage16)) %>% 
              ungroup() %>% group_by(year,cat3,schooling) %>%
            mutate(schoolcount = sum(round(earnwt))) %>%
              ungroup() %>% group_by(year) %>%
    mutate(empl.tot = sum(earnwt),empl.sh = empl/empl.tot,
           school.sh = schoolcount/empl) %>% 
  ungroup() %>% distinct(year,cat3,hrwage16.mean,hrwage16.median,hrwage16.sd,empl.sh,school.sh,schooling) %>% arrange(year,cat3,schooling) 

fwrite(sumstats,paste0(dir.generated,'r/morg_sumstats_table.csv'))

## Summary statistics, breakdown by schooling
sumstats.all <- dat.stats %>%  group_by(year) %>% 
  mutate(hrwage16.mean = weighted.mean(hrwage16,weight=earnwt),
         hrwage16.sd = sd(hrwage16),
         hrwage16.median = median(hrwage16),
              empl = sum(round(earnwt))) %>% 
              ungroup() %>% group_by(year,schooling) %>%
            mutate(schoolcount = sum(round(earnwt))) %>%
              ungroup() %>% group_by(year) %>%
    mutate(empl.tot = sum(earnwt),empl.sh = empl/empl.tot,
           school.sh = schoolcount/empl) %>% 
  ungroup() %>% distinct(year,hrwage16.mean,hrwage16.median,hrwage16.sd,empl.sh,school.sh,schooling) %>% arrange(year,schooling) 


fwrite(sumstats.all,paste0(dir.generated,'r/morg_sumstats_all_table.csv'))

## Mean income (weekly earnings * 50 weeks) and R and M share in labor income
# function for rounding to 2 digits
rnd2 <- function(number){
return(format(round(number,2), nsmall=2))
}



mean_inc <- rnd2(mean(dat$earnwke16)*50)
print(paste0('Mean income is: ',as.character(mean_inc)))

# shares computed based on weekly earnigns
total_weighted_inc <- sum(dat$earnwt*dat$hrwage16)
M_weighted_inc <- sum(dat$earnwt[dat$cat3=='svc']*dat$hrwage16[dat$cat3=='svc'])
R_weighted_inc <- sum(dat$earnwt[dat$cat3=='rt']*dat$hrwage16[dat$cat3=='rt'])
M_share_inc <- M_weighted_inc/total_weighted_inc
R_share_inc <- R_weighted_inc/total_weighted_inc

inc <- data.table(
    mean = mean_inc,
    R_share = R_share_inc,
    M_share = M_share_inc
)

fwrite(inc,'./data/generated/r/mean_inc.csv')

## sumstats for participation (wagesample == 1 corresponds to participant, drop other participants (e.g. part time), keep non-participants)
dat.part <- dat.in %>% filter((lfsr89 %in% c(1,2) & wagesample==1)|!lfsr89 %in% c(1,2)) %>%
    mutate(participant = ifelse(lfsr89 %in% c(1,2),1,0))

stats.part <- dat.part %>% filter(participant==1) %>% group_by(cat3) %>% summarise(count = n(), share = count/nrow(dat.part))

part <- data.table(
    M = NA,
    R = NA,
    C = NA,
    all = NA
)

part$M <- stats.part$share[stats.part$cat3=='svc']
part$R <- stats.part$share[stats.part$cat3=='rt']
part$C <- stats.part$share[stats.part$cat3=='abs']
part$all <- sum(stats.part$share)

fwrite(part,'./data/generated/r/part_share.csv')
